Preparation
Genotyped data set preparation
First we define input and output path shortcuts.
input <- file.path(getwd(), "data", "Input")
output <- file.path(getwd(), "data", "Output")
Then we delete the the output folder and all the files and folders it contains from the previous run.
unlink(output, recursive = TRUE)
The following code chunk consists in the arguments that this R script can take from the user. “verbose” means whether to output to the terminal, “imputated” is if the program should expect an imputated dataset “EP” performs needed data correction before analysis in the Epistasis Project dataset and “ADNI” means if the input dataset also includes the ADNI dataset.
verbose <- FALSE
Then, we proceed to read and store the genotyped input data.
encoded_NA <- c("00", "", "???", "-9")
df <- read.csv2(file.path(input, "genotyped.csv"),
na.strings = encoded_NA)
str(df, list.len = 30)
## 'data.frame': 2462 obs. of 100 variables:
## $ ID : chr "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
## $ Plate : chr "AST5" "AST5" "AST5" "AST5" ...
## $ WELL : chr "B01" "B03" "D01" "B05" ...
## $ CentreCODE : chr "RG 728" "RG 733" "RG 606" "RG 743" ...
## $ Diag : chr "AD" "AD" "AD" "AD" ...
## $ SexLett : chr "Male" "Female" "Male" "Female" ...
## $ Age : int 69 69 81 76 75 75 80 74 71 76 ...
## $ AgeExam : num 69 69 81 76 75 75 80 74 71 76 ...
## $ AgeOnset : num NA NA NA NA NA NA NA NA NA NA ...
## $ AgeDeath : int NA NA NA NA NA NA NA NA NA NA ...
## $ Centre_APOE : int 33 34 33 33 33 33 23 33 23 33 ...
## $ Centre : chr "OVIEDO" "OVIEDO" "OVIEDO" "OVIEDO" ...
## $ rs429358 : chr "TT" "CT" "TT" "TT" ...
## $ rs7412 : chr "CC" "CC" "CC" "CC" ...
## $ LGC_APOE : int 33 34 33 33 33 33 23 33 23 33 ...
## $ LGC_E4. : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FID : chr "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
## $ IID : chr "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
## $ PID : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MID : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Sex : int 1 2 1 2 2 1 1 2 2 1 ...
## $ Dx : int 2 2 2 2 2 2 2 2 2 2 ...
## $ SORT1_rs17585355 : chr "AA" "AA" "AA" "AA" ...
## $ SORT1_rs2228604 : chr "CC" "CC" "AC" "AC" ...
## $ SORT1_rs7536292 : chr "TT" "CT" "TT" "TT" ...
## $ SORT1_rs17646665 : chr "AA" "AA" "AA" "AA" ...
## $ SORT1_rs1149175 : chr "GG" "GG" "GG" "GG" ...
## $ SORT1_rs10745354 : chr "TT" "TT" "CT" "CT" ...
## $ IL10_rs1800896 : chr "AA" "GA" "GG" "GA" ...
## $ BIN1_rs744373 : chr "GG" "GA" "GA" "GG" ...
## [list output truncated]
Reading the snps that we want to study
vector_of_gene_snps <- scan(file.path(input, "snps.txt"),
what = "character",
quiet = !verbose)
vector_of_gene_snps
## [1] "TCN2_rs5749131" "TCN2_rs5753231" "TCN2_rs16988828"
## [4] "TCN2_rs9606756" "TCN2_rs757874" "TCN2_rs1801198"
## [7] "TCN2_rs5749135" "TCN2_rs9621049" "TCN2_rs1131603"
## [10] "TCN2_rs7289553" "TCN2_rs5997711" "MTRR_rs1801394"
## [13] "MTRR_rs13181011" "MTRR_rs326120" "MTRR_rs1532268"
## [16] "MTRR_rs7703033" "MTRR_rs6555501" "MTRR_rs162035"
## [19] "MTRR_rs3815743" "MTRR_rs162040" "MTRR_rs10475399"
## [22] "SORT1_rs17585355" "SORT1_rs2228604" "SORT1_rs7536292"
## [25] "SORT1_rs17646665" "SORT1_rs1149175" "SORT1_rs10745354"
## [28] "BDNF_rs6265" "BDNF_rs11030102" "BDNF_rs11030104"
## [31] "BDNF_rs11030119" "BDNF_rs12288512" "SHMT1_rs16961153"
## [34] "SHMT1_rs1979277" "SHMT1_rs669340" "SHMT1_rs643333"
## [37] "SHMT1_rs9901160" "RFC1_rs11096990" "RFC1_rs4975002"
## [40] "RFC1_rs4975009" "RFC1_rs6851075" "APOE_rs429358"
## [43] "APOE_rs7412" "APOE_rs597668" "ABCA1_rs1800977"
## [46] "ABCA1_rs2422493" "CYP46A1_rs7157609" "CYP46A1_rs4900442"
## [49] "DBH_rs1611115" "DBH_rs1611131" "GAB2_rs4945261"
## [52] "GAB2_rs2373115" "MAPT_rs242557" "MAPT_rs2471738"
## [55] "NPC1_rs2236707" "NPC1_rs4800488" "NR1H2_rs2695121"
## [58] "NR1H2_rs1052533" "NTRK2_rs1187323" "NTRK2_rs1545285"
## [61] "PEMT_rs7946" "PEMT_rs12325817" "VDR_rs731236"
## [64] "VDR_rs7975232" "BIN1_rs744373" "CD33_rs3865444"
## [67] "CDK5_rs2069442" "HMDX1_rs2071746" "HMGCR_rs3931914"
## [70] "IL10_rs1800896" "LRP1_rs1799986" "MTHFD1L_rs11754661"
## [73] "PRNP_rs1799990" "SLC19A1_rs1051266" "TRAK2_rs13022344"
length(vector_of_gene_snps)
## [1] 75
Chosing other variables of interest
variables <- c("ID", "Diag", "SexLett", "Age", "Centre", "LGC_E4.")
Subsetting the dataframe with the desired variables
df <- df[, c(variables, vector_of_gene_snps)]
dim(df)
## [1] 2462 81
Then we check for missingness across all the dataset.
colSums(is.na(df))
## ID Diag SexLett Age
## 21 21 21 21
## Centre LGC_E4. TCN2_rs5749131 TCN2_rs5753231
## 21 21 82 72
## TCN2_rs16988828 TCN2_rs9606756 TCN2_rs757874 TCN2_rs1801198
## 47 56 66 62
## TCN2_rs5749135 TCN2_rs9621049 TCN2_rs1131603 TCN2_rs7289553
## 47 57 44 89
## TCN2_rs5997711 MTRR_rs1801394 MTRR_rs13181011 MTRR_rs326120
## 65 61 42 80
## MTRR_rs1532268 MTRR_rs7703033 MTRR_rs6555501 MTRR_rs162035
## 74 61 76 47
## MTRR_rs3815743 MTRR_rs162040 MTRR_rs10475399 SORT1_rs17585355
## 85 69 75 51
## SORT1_rs2228604 SORT1_rs7536292 SORT1_rs17646665 SORT1_rs1149175
## 59 83 37 46
## SORT1_rs10745354 BDNF_rs6265 BDNF_rs11030102 BDNF_rs11030104
## 58 47 83 69
## BDNF_rs11030119 BDNF_rs12288512 SHMT1_rs16961153 SHMT1_rs1979277
## 81 54 51 72
## SHMT1_rs669340 SHMT1_rs643333 SHMT1_rs9901160 RFC1_rs11096990
## 46 68 51 387
## RFC1_rs4975002 RFC1_rs4975009 RFC1_rs6851075 APOE_rs429358
## 84 58 80 172
## APOE_rs7412 APOE_rs597668 ABCA1_rs1800977 ABCA1_rs2422493
## 59 77 57 73
## CYP46A1_rs7157609 CYP46A1_rs4900442 DBH_rs1611115 DBH_rs1611131
## 100 48 47 434
## GAB2_rs4945261 GAB2_rs2373115 MAPT_rs242557 MAPT_rs2471738
## 58 52 53 94
## NPC1_rs2236707 NPC1_rs4800488 NR1H2_rs2695121 NR1H2_rs1052533
## 62 53 74 89
## NTRK2_rs1187323 NTRK2_rs1545285 PEMT_rs7946 PEMT_rs12325817
## 69 78 80 97
## VDR_rs731236 VDR_rs7975232 BIN1_rs744373 CD33_rs3865444
## 87 69 83 58
## CDK5_rs2069442 HMDX1_rs2071746 HMGCR_rs3931914 IL10_rs1800896
## 87 49 58 52
## LRP1_rs1799986 MTHFD1L_rs11754661 PRNP_rs1799990 SLC19A1_rs1051266
## 92 70 67 62
## TRAK2_rs13022344
## 61
We remove those cases which are missing data for all six first columns (not genotypes)
df <- df[rowSums(is.na(df[,1:6])) != 6,]
colSums(is.na(df[,1:6]))
## ID Diag SexLett Age Centre LGC_E4.
## 0 0 0 0 0 0
After that, we order the genotypes so that in the heterozygotes are always sorted by alphabetical order (“A”, “C”, “G”, “T”).
df[,c(vector_of_gene_snps)] <- lapply(df[,c(vector_of_gene_snps)], order_heterozygotes)
Then we create a list of objects that for each SNP contains information about its name, gene it belongs, genotype counts, genotype frequencies, allele frequencies and counts, minor allele and major allele etc.
list_of_objects <- generate_object(exists = exists('list_of_objects'),
dataset = df,
snps = vector_of_gene_snps,
verbose = verbose)
list_of_objects
## This is just a convenience function, invocated when print() is used, to provide an overview of the genes and snps present in this list. To view its contents, use list_of_objects$your_snp_id
##
##
## $snps
## [1] "rs5749131" "rs5753231" "rs16988828" "rs9606756" "rs757874"
## [6] "rs1801198" "rs5749135" "rs9621049" "rs1131603" "rs7289553"
## [11] "rs5997711" "rs1801394" "rs13181011" "rs326120" "rs1532268"
## [16] "rs7703033" "rs6555501" "rs162035" "rs3815743" "rs162040"
## [21] "rs10475399" "rs17585355" "rs2228604" "rs7536292" "rs17646665"
## [26] "rs1149175" "rs10745354" "rs6265" "rs11030102" "rs11030104"
## [31] "rs11030119" "rs12288512" "rs16961153" "rs1979277" "rs669340"
## [36] "rs643333" "rs9901160" "rs11096990" "rs4975002" "rs4975009"
## [41] "rs6851075" "rs429358" "rs7412" "rs597668" "rs1800977"
## [46] "rs2422493" "rs7157609" "rs4900442" "rs1611115" "rs1611131"
## [51] "rs4945261" "rs2373115" "rs242557" "rs2471738" "rs2236707"
## [56] "rs4800488" "rs2695121" "rs1052533" "rs1187323" "rs1545285"
## [61] "rs7946" "rs12325817" "rs731236" "rs7975232" "rs744373"
## [66] "rs3865444" "rs2069442" "rs2071746" "rs3931914" "rs1800896"
## [71] "rs1799986" "rs11754661" "rs1799990" "rs1051266" "rs13022344"
##
## $genes
## [1] "TCN2" "MTRR" "SORT1" "BDNF" "SHMT1" "RFC1" "APOE"
## [8] "ABCA1" "CYP46A1" "DBH" "GAB2" "MAPT" "NPC1" "NR1H2"
## [15] "NTRK2" "PEMT" "VDR" "BIN1" "CD33" "CDK5" "HMDX1"
## [22] "HMGCR" "IL10" "LRP1" "MTHFD1L" "PRNP" "SLC19A1" "TRAK2"
##
##
## The list has a total of 75 snps and 28 genes
Each of the SNP objects present in the list_of_objects have the following structure:
list_of_objects[[1]]
## ###### rs5749131 ######
##
## [gene] TCN2
##
## [id] rs5749131
##
## [geno_count] AA = 173; AG = 478; GG = 288;
##
## [geno_freq] AA = 18.424; AG = 50.905; GG = 30.671;
##
## [allele_count] A = 824; G = 1054;
##
## [allele_freq] A = 43.876; G = 56.124;
##
## [minor_allele] A
##
## [major_allele] G
We can extract just the reference snp id from this object
vector_of_snps <- sapply(list_of_objects, function(x) x$id)
vector_of_snps <- unname(vector_of_snps)
vector_of_snps
## [1] "rs5749131" "rs5753231" "rs16988828" "rs9606756" "rs757874"
## [6] "rs1801198" "rs5749135" "rs9621049" "rs1131603" "rs7289553"
## [11] "rs5997711" "rs1801394" "rs13181011" "rs326120" "rs1532268"
## [16] "rs7703033" "rs6555501" "rs162035" "rs3815743" "rs162040"
## [21] "rs10475399" "rs17585355" "rs2228604" "rs7536292" "rs17646665"
## [26] "rs1149175" "rs10745354" "rs6265" "rs11030102" "rs11030104"
## [31] "rs11030119" "rs12288512" "rs16961153" "rs1979277" "rs669340"
## [36] "rs643333" "rs9901160" "rs11096990" "rs4975002" "rs4975009"
## [41] "rs6851075" "rs429358" "rs7412" "rs597668" "rs1800977"
## [46] "rs2422493" "rs7157609" "rs4900442" "rs1611115" "rs1611131"
## [51] "rs4945261" "rs2373115" "rs242557" "rs2471738" "rs2236707"
## [56] "rs4800488" "rs2695121" "rs1052533" "rs1187323" "rs1545285"
## [61] "rs7946" "rs12325817" "rs731236" "rs7975232" "rs744373"
## [66] "rs3865444" "rs2069442" "rs2071746" "rs3931914" "rs1800896"
## [71] "rs1799986" "rs11754661" "rs1799990" "rs1051266" "rs13022344"
And then we can convert the categorical variables in the dataframe into the factor datatype.
df[-c(1,4)] <- lapply(df[-c(1,4)], as.factor)
Imputated data set preparation
First we read and store the the imputated data.
encoded_NA <- c("#NULL!", "NA")
imput_df <- read.csv2(file.path(input, "imputated.csv"),
na.strings = encoded_NA)
str(imput_df, list.len = 30)
## 'data.frame': 6357 obs. of 93 variables:
## $ id : int 3406 4010 12046 3784 2210 8831 1106 5426 3578 8432 ...
## $ sex : chr "Woman" "Woman" "Woman" "Woman" ...
## $ age.at.entry : num 58 58 59 58 60 60 55 57 57 57 ...
## $ Age.to.Use : num 60 60 60 60 60 60 60 61 61 61 ...
## $ Age75 : chr "lessthan75" "lessthan75" "lessthan75" "lessthan75" ...
## $ prevalent_dementia: int 0 0 0 0 0 0 0 0 0 0 ...
## $ incident_dementia : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Dx : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DiagName : chr "CTRL" "CTRL" "CTRL" "CTRL" ...
## $ ageatdementia : num NA NA NA NA NA NA NA NA NA NA ...
## $ followup : num 2.0589 1.8563 1.2238 1.9439 0.0493 ...
## $ E4status : chr "E4+" "2" "E4-" "E4-" ...
## $ apoe : int NA NA NA NA NA NA NA NA NA NA ...
## $ rs429358 : num 1.017 0.981 1.999 2 1.994 ...
## $ rs7412 : num 1 1.05 1 2 2 ...
## $ rs670139 : num 1 2 2 1 2 2 0 2 1 2 ...
## $ rs242557 : int 2 NA 1 NA 1 0 0 1 1 1 ...
## $ rs1052533 : int 1 NA 0 1 1 1 2 1 0 0 ...
## $ rs2471738 : int 2 1 0 0 0 0 0 0 1 1 ...
## $ rs4975002 : num 0.001 1.001 1.999 1 0.999 ...
## $ rs1131603 : num 2 2 1.08 2 1.66 ...
## $ rs7975232 : num 0 2 0 1.07 2 ...
## $ rs2695121 : num 1 2 1.03 2 1.01 ...
## $ rs3865444 : num 1.999 0.141 1.005 2 0.784 ...
## $ rs4975009 : num 0.006 1 2 0 0.808 ...
## $ rs1800977 : num 1 2 0 0 1 2 1 2 1 1 ...
## $ rs709149 : num 1 2 1 1 1 0 2 0 1 1 ...
## $ rs12288512 : num 1 2 2 1 1 ...
## $ rs4945261 : int 2 1 0 1 2 2 2 2 2 1 ...
## $ rs7946 : num 0.008 1 0 0 0 0 0 1 0 0 ...
## [list output truncated]
Then we subset the data of interest.
imput_variables <- c("id", "DiagName", "sex", "Age.to.Use", "E4status")
imput_df <- imput_df[,c(imput_variables, vector_of_snps)]
dim(imput_df)
## [1] 6357 80
Then we check for missingness across all the dataset.
colSums(is.na(imput_df))
## id DiagName sex Age.to.Use E4status rs5749131 rs5753231
## 0 0 0 0 224 431 431
## rs16988828 rs9606756 rs757874 rs1801198 rs5749135 rs9621049 rs1131603
## 431 431 431 431 431 431 431
## rs7289553 rs5997711 rs1801394 rs13181011 rs326120 rs1532268 rs7703033
## 431 431 431 431 431 431 431
## rs6555501 rs162035 rs3815743 rs162040 rs10475399 rs17585355 rs2228604
## 431 431 431 431 431 431 431
## rs7536292 rs17646665 rs1149175 rs10745354 rs6265 rs11030102 rs11030104
## 431 431 431 431 431 431 431
## rs11030119 rs12288512 rs16961153 rs1979277 rs669340 rs643333 rs9901160
## 431 431 431 431 431 431 431
## rs11096990 rs4975002 rs4975009 rs6851075 rs429358 rs7412 rs597668
## 431 431 431 431 431 431 431
## rs1800977 rs2422493 rs7157609 rs4900442 rs1611115 rs1611131 rs4945261
## 431 431 431 431 431 431 431
## rs2373115 rs242557 rs2471738 rs2236707 rs4800488 rs2695121 rs1052533
## 431 299 275 431 431 431 407
## rs1187323 rs1545285 rs7946 rs12325817 rs731236 rs7975232 rs744373
## 431 431 431 431 431 431 431
## rs3865444 rs2069442 rs2071746 rs3931914 rs1800896 rs1799986 rs11754661
## 431 431 431 431 431 431 431
## rs1799990 rs1051266 rs13022344
## 431 431 431
And we remove cases which have a missing value for the column E4status.
imput_df <- imput_df[is.na(imput_df$E4status) == FALSE,]
colSums(is.na(imput_df[,1:5]))
## id DiagName sex Age.to.Use E4status
## 0 0 0 0 0
Then, we import data from the allele schema used in the imputation, in order to reconvert it back to genotypes.
imput_scheme <- read.csv(file.path(input, "imputation_scheme.csv"), sep = ";")
imput_scheme <- imput_scheme[imput_scheme$SNP %in% vector_of_snps,]
colnames(imput_scheme) <- c("snp_id", "reference", "alternative")
There is still some snp that is not found in the reference scheme. It seems some of the snps in the dataset were directly genotyped and codified as dummy allele dosage data, so we will use the human reference genome GRCh37 to recodify them.
genotyped_snps <- setdiff(vector_of_snps, imput_scheme$snp_id)
genotyped_snps
## [1] "rs1052533"
GRCh37 <- read.table(file.path(input, "GRCh37_snps.txt"), quote = "\"", comment.char = "")
colnames(GRCh37) <- c("chromosome", "location", "snp_id", "reference", "alternative")
index <- nrow(imput_scheme)
for (snp in genotyped_snps) {
i <- which(genotyped_snps == snp)
imput_scheme[index + i,] <- GRCh37[GRCh37$snp_id == snp,][,c(3,4,5)]
}
row.names(imput_scheme) <- NULL
So the final dataframe used to recodify the imputated data into genotypes will be the following one:
imput_scheme
|
snp_id
|
reference
|
alternative
|
|
rs10475399
|
A
|
G
|
|
rs1051266
|
T
|
C
|
|
rs1052533
|
G
|
A
|
|
rs10745354
|
T
|
C
|
|
rs11030102
|
C
|
G
|
|
rs11030104
|
A
|
G
|
|
rs11030119
|
G
|
A
|
|
rs11096990
|
C
|
T
|
|
rs1131603
|
T
|
C
|
|
rs1149175
|
G
|
A
|
|
rs11754661
|
G
|
A
|
|
rs1187323
|
A
|
C
|
|
rs12288512
|
G
|
A
|
|
rs12325817
|
C
|
G
|
|
rs13022344
|
T
|
C
|
|
rs13181011
|
T
|
C
|
|
rs1532268
|
C
|
T
|
|
rs1545285
|
T
|
G
|
|
rs1611115
|
C
|
T
|
|
rs1611131
|
A
|
G
|
|
rs162035
|
G
|
A
|
|
rs162040
|
A
|
C
|
|
rs16961153
|
G
|
A
|
|
rs16988828
|
A
|
G
|
|
rs17585355
|
A
|
C
|
|
rs17646665
|
A
|
G
|
|
rs1799986
|
C
|
T
|
|
rs1799990
|
A
|
G
|
|
rs1800896
|
T
|
C
|
|
rs1800977
|
G
|
A
|
|
rs1801198
|
C
|
G
|
|
rs1801394
|
A
|
G
|
|
rs1979277
|
G
|
A
|
|
rs2069442
|
C
|
G
|
|
rs2071746
|
A
|
T
|
|
rs2228604
|
G
|
T
|
|
rs2236707
|
C
|
T
|
|
rs2373115
|
C
|
A
|
|
rs2422493
|
G
|
A
|
|
rs242557
|
A
|
G
|
|
rs2471738
|
T
|
C
|
|
rs2695121
|
C
|
T
|
|
rs326120
|
A
|
G
|
|
rs3815743
|
A
|
G
|
|
rs3865444
|
C
|
A
|
|
rs3931914
|
C
|
G
|
|
rs429358
|
T
|
C
|
|
rs4800488
|
A
|
C
|
|
rs4900442
|
C
|
T
|
|
rs4945261
|
G
|
A
|
|
rs4975002
|
T
|
G
|
|
rs4975009
|
T
|
C
|
|
rs5749131
|
G
|
A
|
|
rs5749135
|
C
|
T
|
|
rs5753231
|
C
|
T
|
|
rs597668
|
T
|
C
|
|
rs5997711
|
C
|
T
|
|
rs6265
|
C
|
T
|
|
rs643333
|
G
|
T
|
|
rs6555501
|
T
|
C
|
|
rs669340
|
C
|
G
|
|
rs6851075
|
T
|
C
|
|
rs7157609
|
G
|
A
|
|
rs7289553
|
A
|
G
|
|
rs731236
|
A
|
G
|
|
rs7412
|
C
|
T
|
|
rs744373
|
A
|
G
|
|
rs7536292
|
T
|
C
|
|
rs757874
|
G
|
T
|
|
rs7703033
|
G
|
A
|
|
rs7946
|
C
|
T
|
|
rs7975232
|
C
|
A
|
|
rs9606756
|
A
|
G
|
|
rs9621049
|
C
|
T
|
|
rs9901160
|
G
|
A
|
First, though the imputation data is converted to the numeric datatype.
imput_df[, 6:ncol(imput_df)] <- lapply(imput_df[,6:ncol(imput_df)],
function(x) as.numeric(x))
Then we proceed to the recoding of the imputation data into genotyped one.
imput_df <- genotype_imputated_df(list_of_objects = list_of_objects,
df = imput_df,
scheme = imput_scheme,
match_strands = T,
verbose = verbose)
Then again like we did with the genotyped dataset, we order the genotypes so that in the heterozygotes are always sorted by alphabetical order (“A”, “C”, “G”, “T”).
imput_df[,c(vector_of_snps)] <- lapply(imput_df[,c(vector_of_snps)], order_heterozygotes)
And we convert data types from characters to factors
imput_df[-c(1,4)] <- lapply(imput_df[-c(1,4)], as.factor)
Merging imputated and genotyped datasets
Renaming variables names in the datasets
colnames(df) <- c("ID", "Diag", "Sex", "Age_to_use", "Centre", "E4status", vector_of_snps)
colnames(imput_df) <- c("ID", "Diag", "Sex", "Age_to_use", "E4status", vector_of_snps)
As we are studying LOAD, subset cases higher or equal than 60 years old in the datasets
df_old <- df
imput_df_old <- imput_df
df <- df[df$Age_to_use >= 60,]
imput_df <- imput_df[imput_df$Age_to_use >= 60,]
And we can see that 81 cases were removed from the genotype dataset, and 0 from the imputated.
nrow(df_old) - nrow(df)
## [1] 81
# 81
nrow(imput_df_old) - nrow(imput_df)
## [1] 0
# 0
We can calculate the median of the combined population
median(c(df$Age_to_use, imput_df$Age_to_use))
## [1] 82
# 82
Renaming the levels of E4status in the genotyped dataset.
levels(df$E4status)
## [1] "0" "1"
levels(df$E4status) <- c("E4-", "E4+")
And the levels of Diagnosis in the imputated dataset.
levels(imput_df$Diag)
## [1] "CTRL" "dementiaAD"
levels(imput_df$Diag) <- c("Control", "AD")
We also create a binary variable in both datasets that tells whether the age of the individual is equal or above the median of the population. In this case, as we saw earlier it is 82.
df$Age82 <- ifelse(df$Age_to_use >= 82, ">82", "<82")
imput_df$Age82 <- ifelse(imput_df$Age_to_use >= 82, ">82", "<82")
df$Age82 <- as.factor(df$Age82)
We found a case of unknown gender in the imputated dataset, so we remove it and reset the levels. And then rename them as “Male” and “Female”.
table(imput_df$Sex)
##
## 1 Man Woman
## 1 2463 3669
imput_df$Sex <- as.character(imput_df$Sex)
imput_df <- imput_df[imput_df$Sex != 1,]
imput_df$Sex <- factor(imput_df$Sex)
levels(imput_df$Sex)
## [1] "Man" "Woman"
levels(imput_df$Sex) <- c("Male", "Female")
After a quick look at the data we decide to codify as missing an unexplained level in the E4status variable at the Rotterdam dataset. And remove those missing cases.
table(imput_df$E4status)
##
## 2 E4- E4+
## 141 4423 1568
imput_df$E4status <- as.character(imput_df$E4status)
imput_df <- imput_df[imput_df$E4status != 2,]
imput_df$E4status <- factor(imput_df$E4status)
And we create a new variable “Centre” to keep track that this cases come from the center of Rotterdam.
imput_df$Centre <- "ROTTERDAM"
The last step before merging is extracting the data from Rotterdam to fill up the list_of_objects created in a previous step.
list_of_objects <- generate_object(exists = exists('list_of_objects'),
dataset = imput_df,
snps = vector_of_snps)
Then, the final structure obtained would be similar to the following one:
str(list_of_objects)
## This is just a convenience function, invocated when str() is used, to provide an overview of the list elements. To view its contents, use list_of_objects$your_snp_id
##
## An example of the structure of the objects in the list is the following one:
##
## rs5749131 : List of 14
## $ gene : chr "TCN2"
## $ id : chr "rs5749131"
## $ geno_count : 'table' int [1:3(1d)] 173 478 288
## $ geno_freq : 'table' num [1:3(1d)] 18.4 50.9 30.7
## $ allele_count : Named num [1:2] 824 1054
## $ allele_freq : Named num [1:2] 43.9 56.1
## $ minor_allele : chr "A"
## $ major_allele : chr "G"
## $ imput_geno_count : 'table' int [1:3(1d)] 889 2224 1455
## $ imput_geno_freq : 'table' num [1:3(1d)] 19.5 48.7 31.9
## $ imput_allele_count: Named num [1:2] 4002 5134
## $ imput_allele_freq : Named num [1:2] 43.8 56.2
## $ imput_minor_allele: chr "A"
## $ imput_major_allele: chr "G"
Once we’ve done that, we can proceed with the merging.
matching_variables <- c("ID", "Diag", "Sex", "E4status", "Age_to_use", "Age82", "Centre", vector_of_snps)
All <- merge(x = df, y = imput_df, by = matching_variables, all = T)
str(All, list.len = 30)
## 'data.frame': 8351 obs. of 82 variables:
## $ ID : chr "10" "100" "1000" "10000" ...
## $ Diag : Factor w/ 2 levels "AD","Control": 2 2 1 2 2 2 2 2 2 2 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 1 2 2 1 ...
## $ E4status : Factor w/ 2 levels "E4-","E4+": 1 1 2 1 1 1 1 1 2 1 ...
## $ Age_to_use: num 82 77 82 78 92 78 82 83 86 87 ...
## $ Age82 : Factor w/ 2 levels "<82",">82": 2 1 2 1 2 1 2 2 2 2 ...
## $ Centre : Factor w/ 7 levels "BRISTOL","MADRID",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ rs5749131 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 2 3 2 2 2 2 2 ...
## $ rs5753231 : Factor w/ 3 levels "CC","CT","TT": 2 1 1 2 1 2 2 1 1 2 ...
## $ rs16988828: Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 1 1 1 1 ...
## $ rs9606756 : Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 1 2 1 2 ...
## $ rs757874 : Factor w/ 3 levels "GG","GT","TT": 1 1 2 2 1 2 2 1 1 1 ...
## $ rs1801198 : Factor w/ 3 levels "CC","CG","GG": 3 1 2 2 1 2 2 2 2 2 ...
## $ rs5749135 : Factor w/ 3 levels "CC","CT","TT": 1 2 2 2 3 2 2 1 2 1 ...
## $ rs9621049 : Factor w/ 3 levels "CC","CT","TT": 1 2 1 1 1 1 1 2 1 2 ...
## $ rs1131603 : Factor w/ 3 levels "CC","CT","TT": 3 3 3 3 2 3 3 3 3 3 ...
## $ rs7289553 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 2 3 2 2 1 2 2 ...
## $ rs5997711 : Factor w/ 3 levels "CC","CT","TT": 3 1 2 2 1 2 2 2 2 2 ...
## $ rs1801394 : Factor w/ 3 levels "AA","AG","GG": 3 2 1 3 3 3 1 2 3 3 ...
## $ rs13181011: Factor w/ 3 levels "CC","CT","TT": 1 3 3 3 2 2 3 3 2 2 ...
## $ rs326120 : Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 3 2 1 1 ...
## $ rs1532268 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 3 2 2 3 3 2 2 ...
## $ rs7703033 : Factor w/ 3 levels "AA","AG","GG": 3 2 3 1 2 2 3 2 2 2 ...
## $ rs6555501 : Factor w/ 3 levels "CC","CT","TT": 3 2 3 1 2 2 3 2 2 2 ...
## $ rs162035 : Factor w/ 3 levels "AA","AG","GG": 3 3 3 2 3 3 3 3 3 3 ...
## $ rs3815743 : Factor w/ 3 levels "AA","AG","GG": 1 1 2 1 1 1 1 1 1 1 ...
## $ rs162040 : Factor w/ 3 levels "AA","AC","CC": 1 2 1 1 1 1 3 1 1 1 ...
## $ rs10475399: Factor w/ 3 levels "AA","AG","GG": 3 2 2 2 3 3 1 2 3 3 ...
## $ rs17585355: Factor w/ 3 levels "AA","AC","CC": 1 1 1 1 1 1 1 1 1 2 ...
## $ rs2228604 : Factor w/ 3 levels "AA","AC","CC": 3 3 2 2 1 2 3 2 2 2 ...
## [list output truncated]
Before proceeding into quality control we should check missingness in the obtained dataset.
colSums(is.na(All))
## ID Diag Sex E4status Age_to_use Age82 Centre
## 0 0 0 0 0 0 0
## rs5749131 rs5753231 rs16988828 rs9606756 rs757874 rs1801198 rs5749135
## 458 450 423 432 444 440 425
## rs9621049 rs1131603 rs7289553 rs5997711 rs1801394 rs13181011 rs326120
## 434 420 465 443 439 420 454
## rs1532268 rs7703033 rs6555501 rs162035 rs3815743 rs162040 rs10475399
## 451 439 453 425 462 445 453
## rs17585355 rs2228604 rs7536292 rs17646665 rs1149175 rs10745354 rs6265
## 427 437 461 415 423 436 424
## rs11030102 rs11030104 rs11030119 rs12288512 rs16961153 rs1979277 rs669340
## 460 445 459 432 429 448 423
## rs643333 rs9901160 rs11096990 rs4975002 rs4975009 rs6851075 rs429358
## 445 427 764 459 435 456 548
## rs7412 rs597668 rs1800977 rs2422493 rs7157609 rs4900442 rs1611115
## 437 453 434 451 477 426 425
## rs1611131 rs4945261 rs2373115 rs242557 rs2471738 rs2236707 rs4800488
## 809 436 430 310 327 439 431
## rs2695121 rs1052533 rs1187323 rs1545285 rs7946 rs12325817 rs731236
## 450 444 446 452 456 474 465
## rs7975232 rs744373 rs3865444 rs2069442 rs2071746 rs3931914 rs1800896
## 446 461 436 461 425 436 428
## rs1799986 rs11754661 rs1799990 rs1051266 rs13022344
## 470 447 443 440 439
Finally, we divide the dataset into regions according to the centres.
All$Region <- ifelse(All$Centre == "MADRID" | All$Centre == "OVIEDO" | All$Centre == "SANTANDER", "Spain", "N.Eur")
Quality control
We will perform a quality control of the samples before proceeding to the analysis step. An example of typical quality control procedures usually performed can be found here
Formatting dataset
Once we’ve merged both datasets we can proceed with quality control procedures.
We’ve previosly prepared a map file with the snps under study. It was generated with the GRCh38 genome assembly version.
map <- read.delim(file.path(input, "raw_data.map"), header = FALSE)
colnames(map) <- c("Chr", "SNP", "GD", "BPP")
The file has the following structure:
map
|
Chr
|
SNP
|
GD
|
BPP
|
|
1
|
rs10745354
|
0
|
109389286
|
|
1
|
rs1149175
|
0
|
109379755
|
|
1
|
rs17585355
|
0
|
109315193
|
|
1
|
rs17646665
|
0
|
109369429
|
|
1
|
rs1800896
|
0
|
206773552
|
|
1
|
rs2228604
|
0
|
109342153
|
We can see that the snps ids are exactly the same found in our dataset so we already got all the information we need for the map file.
setdiff(map$SNP, names(list_of_objects))
## character(0)
setdiff(names(list_of_objects), map$SNP)
## character(0)
Now, we take care of the ped file. First, we create an empty ped file structure that we will fill up with the data.
ped <- data.frame(matrix(ncol = 6, nrow = nrow(All)))
colnames(ped) <- c("FID", "IID", "PID", "MID", "Sex", "P")
Then we introduce the columns values for each of the defined variables
ped$FID <- All$ID #It is the same plink does when the flag --no-fid is used
ped$IID <- All$ID
ped$PID <- 0
ped$MID <- 0
ped$Sex <- sapply(All$Sex, function(x) {if (is.na(x)) {0} else if (x == "Male") {1} else if (x == "Female") {2} else {0}})
ped$P <- sapply(All$Diag, function(x) {if (is.na(x)) {-9} else if (x == "Control") {1} else if (x == "AD") {2} else {-9}})
If we take look we can see that the variables have been coded correctly
head(All[,c("ID", "Sex", "Diag")])
## ID Sex Diag
## 1 10 Male Control
## 2 100 Female Control
## 3 1000 Male AD
## 4 10000 Female Control
## 5 10004 Female Control
## 6 10005 Male Control
head(ped[,c("IID", "Sex", "P")])
## IID Sex P
## 1 10 1 1
## 2 100 2 1
## 3 1000 1 2
## 4 10000 2 1
## 5 10004 2 1
## 6 10005 1 1
Now, then the final step would be to add the genotypes, and as the version of plink we are using (PLINK v1.90) parses the genotypes, we don’t need to split the columns we have into allele columns and we can just bind it directly to our ped file.
ped <- cbind(ped, All[,map$SNP])
But we have yet some missing values remaining in our dataset.
colSums(is.na(ped))
## FID IID PID MID Sex P rs10745354
## 0 0 0 0 0 0 436
## rs1149175 rs17585355 rs17646665 rs1800896 rs2228604 rs7536292 rs13022344
## 423 427 415 428 437 461 439
## rs744373 rs11096990 rs4975002 rs4975009 rs6851075 rs10475399 rs13181011
## 461 764 459 435 456 453 420
## rs1532268 rs162035 rs162040 rs1801394 rs326120 rs3815743 rs3931914
## 451 425 445 439 454 462 436
## rs6555501 rs7703033 rs11754661 rs2069442 rs1187323 rs1545285 rs1611115
## 453 439 447 461 446 452 425
## rs1611131 rs1800977 rs2422493 rs11030102 rs11030104 rs11030119 rs12288512
## 809 434 451 460 445 459 432
## rs2373115 rs4945261 rs6265 rs1799986 rs731236 rs7975232 rs4900442
## 430 436 424 470 465 446 426
## rs7157609 rs12325817 rs16961153 rs1979277 rs242557 rs2471738 rs643333
## 477 474 429 448 310 327 445
## rs669340 rs7946 rs9901160 rs2236707 rs4800488 rs1052533 rs2695121
## 423 456 427 439 431 444 450
## rs3865444 rs429358 rs597668 rs7412 rs1799990 rs1051266 rs1131603
## 436 548 453 437 443 440 420
## rs16988828 rs1801198 rs2071746 rs5749131 rs5749135 rs5753231 rs5997711
## 423 440 425 458 425 450 443
## rs7289553 rs757874 rs9606756 rs9621049
## 465 444 432 434
And we convert the missing values to 00, as plink expects.
ped[,map$SNP] <- lapply(ped[,map$SNP], as.character)
ped[is.na(ped)] <- "00"
sum(is.na(ped))
## [1] 0
Then we can save the ped file in order to use it as input for plink.
write.table(ped, file = file.path(input, "raw_data.ped"), quote = F, sep = "\t", row.names = F, col.names = F)
Checking plink version
We will use the command line plink program. To get the version, you can just type:
plink --version
# PLINK v1.90b6.21 64-bit (19 Oct 2020)
## PLINK v1.90b6.21 64-bit (19 Oct 2020)
Making binary files
The first step is to create a folder where to store the output that we will obtain in all subsequent steps using the program.
mkdir -p "$PWD"/data/Output/plink/0-Data
run_shell('mkdir %CD%/data/Output/plink/0-Data')
The next step step will be to convert the files obtained into binary files to speed up the computation.
plink --file "$PWD"/data/Input/raw_data --make-bed --out "$PWD"/data/Output/plink/0-Data/binary_data
run_shell('plink --file %CD%/data/Input/raw_data --make-bed --out %CD%/data/Output/plink/0-Data/binary_data', ignore.stdout = !verbose)
Removing missingness
We first filter SNPs and individuals based on a relaxed threshold (0.2; >20%), as this will filter out SNPs and individuals with very high levels of missingness. Then a filter with a more stringent threshold will be applied (0.02). We do first SNP filtering before individual filtering.
mkdir -p "$PWD"/data/Output/plink/1-QC/1-Missingness
plink --bfile "$PWD"/data/Output/plink/0-Data/binary_data --missing --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing
plink --bfile "$PWD"/data/Output/plink/0-Data/binary_data --geno 0.2 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_1
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_1 --mind 0.2 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_2
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_2 --geno 0.02 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_3
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_3 --mind 0.02 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_4
run_shell('mkdir %CD%/data/Output/plink/1-QC/1-Missingness')
run_shell('plink --bfile %CD%/data/Output/plink/0-Data/binary_data --missing --out %CD%/data/Output/plink/1-QC/1-Missingness/missing', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/0-Data/binary_data --geno 0.2 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_1', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_1 --mind 0.2 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_2', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_2 --geno 0.02 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_3', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_3 --mind 0.02 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_4', ignore.stdout = !verbose)
Removing SNPs with lower MAF than threshold
SNPs with a low MAF are rare, therefore power is lacking for detecting SNP‐phenotype associations. These SNPs are also more prone to genotyping errors. The MAF threshold depends on sample size, larger samples can use lower MAF thresholds. As the sample size in this study is moderate (N = 8716), we use the 0.05 as MAF threshold.
mkdir "$PWD"/data/Output/plink/1-QC/2-MAF
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_4 --maf 0.05 --make-bed --out "$PWD"/data/Output/plink/1-QC/2-MAF/minor_allele
run_shell('mkdir %CD%/data/Output/plink/1-QC/2-MAF')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_4 --maf 0.05 --make-bed --out %CD%/data/Output/plink/1-QC/2-MAF/minor_allele', ignore.stdout = !verbose)
Looking at deviations from Hardy-Weinberg equilibrium
Deviations from HWE is a common indicator of genotyping error, but it may also indicate evolutionary selection. We will use a threshold of 10^-6 for controls and 10^-10 for cases.
mkdir "$PWD"/data/Output/plink/1-QC/3-HWE
plink --bfile "$PWD"/data/Output/plink/1-QC/2-MAF/minor_allele --hwe 1e-4 --make-bed --out "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg --hwe 1e-10 include-nonctrl --make-bed --out "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1
run_shell('mkdir %CD%/data/Output/plink/1-QC/3-HWE')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/2-MAF/minor_allele --hwe 1e-6 --make-bed --out %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg --hwe 1e-10 include-nonctrl --make-bed --out %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1', ignore.stdout = !verbose)
Removing individuals with a heterozygosity rate deviating more than 3SD from the mean
Deviations can indicate sample contamination or inbreeding. Heterozygosity checks are performed on a set of SNPs that are not highly correlated. To generate this list of non-(highly)correlated SNPs, we exclude high inversion regions.
mkdir "$PWD"/data/Output/plink/1-QC/4-Het
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --exclude "$PWD"/data/Input/inversion.txt --range --indep-pairwise 50 5 0.2 --out "$PWD"/data/Output/plink/1-QC/4-Het/indepSNP
run_shell('mkdir %CD%/data/Output/plink/1-QC/4-Het')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --exclude %CD%/data/Input/inversion.txt --range --indep-pairwise 50 5 0.2 --out %CD%/data/Output/plink/1-QC/4-Het/indepSNP', ignore.stdout = !verbose)
Once we’ve done that we can compute the heterozygosity rates
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --extract "$PWD"/data/Output/plink/1-QC/4-Het/indepSNP.prune.in --het --out "$PWD"/data/Output/plink/1-QC/4-Het/Het_check
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --extract %CD%/data/Output/plink/1-QC/4-Het/indepSNP.prune.in --het --out %CD%/data/Output/plink/1-QC/4-Het/Het_check', ignore.stdout = !verbose)
From this file we can generate a list of individuals who deviate more than 3 standard deviations from the heterozigosity rate mean.
# All credit to Andries Marees at https://github.com/MareesAT/GWA_tutorial
het <- read.table(file.path(output, "plink", "1-QC", "4-Het", "Het_check.het"), head = TRUE)
het$HET_RATE <- (het$"N.NM." - het$"O.HOM.")/het$"N.NM."
het_fail <- subset(het, (het$HET_RATE < mean(het$HET_RATE) - 3*sd(het$HET_RATE)) | (het$HET_RATE > mean(het$HET_RATE) + 3*sd(het$HET_RATE)))
het_fail$HET_DST <- (het_fail$HET_RATE - mean(het$HET_RATE))/sd(het$HET_RATE)
write.table(het_fail[,1:2], file.path(output, "plink", "1-QC", "4-Het", "fail-het-qc.txt"), row.names = FALSE, quote = F)
Now, we proceed to remove those heterozygosity rate outliers
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --remove "$PWD"/data/Output/plink/1-QC/4-Het/fail-het-qc.txt --make-bed --out "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --remove %CD%/data/Output/plink/1-QC/4-Het/fail-het-qc.txt --make-bed --out %CD%/data/Output/plink/1-QC/4-Het/heterozigosity', ignore.stdout = !verbose)
Population structure analysis
We can also check for population substrure in our dataset, for instance by performing a PCA.
This additional analysis is only actually performed on Unix computers, if in windows I have just provided the commands used and and example of the output obtained. It also isn’t run by default, because of the bottleneck caused by this additional analysis. The results obtained are shown as recorded on a .png file of the PCA plot. You can control if the analysis should be performed by setting the run_PCA variable as TRUE or FALSE.
run_PCA <- FALSE
For that, we downloaded a 1000 genomes file of 629 individuals which were genotyped in blablabla SNPs. This file was downloaded from here: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz.
After downloading it the file was converted to plink files, and performed QC as described in the Population Stratification tutorial in https://github.com/MareesAT/GWA_tutorial.
Removing unshared SNPs
Just the snps present in both datasets were kept to be able to make the comparison. Due to file size, of the thousand genomes files, we’ve already extracted the relevant SNPs from the thousand genomes plink files. To do so for the EP dataset we might do:
mkdir -p "$PWD"/data/Output/plink/2-POP/
awk '{print$2}' "$PWD"/data/Input/1KG_PCA/1KG.bim > "$PWD"/data/Output/plink/2-POP/1KG_snps.txt
plink --bfile "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity --extract "$PWD"/data/Output/plink/2-POP/1KG_snps.txt --make-bed --recode --out "$PWD"/data/Output/plink/2-POP/EP
cat "$PWD"/data/Output/plink/2-POP/1KG_snps.txt | wc -l
Update build
Then, the next step would be to check that both datasets have the same build and if not update the older 1KG file accordingly.
awk '{print$2,$4}' "$PWD"/data/Output/plink/2-POP/EP.map > "$PWD"/data/Output/plink/2-POP/buildEP.txt
plink --bfile "$PWD"/data/Input/1KG_PCA/1KG --update-map "$PWD"/data/Output/plink/2-POP/buildEP.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/1KG_PCA
Prepare merging
In order to merge both files, first we have to ascertain that both files refer to the same reference and alternative alleles.
awk '{print$2,$5}' "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim > "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt
plink --bfile "$PWD"/data/Output/plink/2-POP/EP --reference-allele "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_adj
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/EP_adj.bim > "$PWD"/data/Output/plink/2-POP/EP_adj_tmp
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim > "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp
sort "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp "$PWD"/data/Output/plink/2-POP/EP_adj_tmp | uniq -u > "$PWD"/data/Output/plink/2-POP/all_differences.txt
cat "$PWD"/data/Output/plink/2-POP/all_differences.txt
Those differences might be just caused by looking at different strands so we will flip those bases and see if the error persists.
awk '{print$1}' "$PWD"/data/Output/plink/2-POP/all_differences.txt | sort -u > "$PWD"/data/Output/plink/2-POP/flip_list.txt
plink --bfile "$PWD"/data/Output/plink/2-POP/EP_adj --flip "$PWD"/data/Output/plink/2-POP/flip_list.txt --reference-allele "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_adj_2
## Now check if they are still problematic
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/EP_adj_2.bim > "$PWD"/data/Output/plink/2-POP/EP_adj_2_tmp
sort "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp "$PWD"/data/Output/plink/2-POP/EP_adj_2_tmp | uniq -u > "$PWD"/data/Output/plink/2-POP/still_differences.txt
cat "$PWD"/data/Output/plink/2-POP/still_differences.txt
Merge
And we can see that there are no remaining differences anymore so we shouldn’t remove any SNP and we can proceed to merge both files.
plink --bfile "$PWD"/data/Output/plink/2-POP/EP_adj_2 --bmerge "$PWD"/data/Output/plink/2-POP/1KG_PCA.bed "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim "$PWD"/data/Output/plink/2-POP/1KG_PCA.fam --allow-no-sex --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_1KG_PCA
Labeling the superpopulations
Before doing the PCA we can label the data according to different human superpopulations (AFR, EUR etc) in order to be able to keep track of them when we plot them graphically. For that we get a file with population information from the 1000 genomes and transform the subpopulation codes in our files in superpopulation ones.
# Note: for macOS/BSD users use GNU sed instead of default sed, linux users may use sed directly instead of gsed
wget -P "$PWD"/data/Output/plink/2-POP/ ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/20100804.ALL.panel
## rename populations
awk '{print$1,$1,$2}' "$PWD"/data/Output/plink/2-POP/20100804.ALL.panel > "$PWD"/data/Output/plink/2-POP/pop_1kG.txt
gsed 's/JPT/ASN/g' "$PWD"/data/Output/plink/2-POP/pop_1kG.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG2.txt
gsed 's/ASW/AFR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG2.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG3.txt
gsed 's/CEU/EUR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG3.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG4.txt
gsed 's/CHB/ASN/g' "$PWD"/data/Output/plink/2-POP/pop_1kG4.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG5.txt
gsed 's/CHD/ASN/g' "$PWD"/data/Output/plink/2-POP/pop_1kG5.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG6.txt
gsed 's/YRI/AFR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG6.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG7.txt
gsed 's/LWK/AFR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG7.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG8.txt
gsed 's/TSI/EUR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG8.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG9.txt
gsed 's/MXL/AMR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG9.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG10.txt
gsed 's/GBR/EUR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG10.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG11.txt
gsed 's/FIN/EUR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG11.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG12.txt
gsed 's/CHS/ASN/g' "$PWD"/data/Output/plink/2-POP/pop_1kG12.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG13.txt
gsed 's/PUR/AMR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG13.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG14.txt
# Create our own population file
awk '{print$1,$2,"EP"}' "$PWD"/data/Output/plink/2-POP/EP_adj_2.fam > "$PWD"/data/Output/plink/2-POP/pop_EP.txt
# Concatenate population files.
cat "$PWD"/data/Output/plink/2-POP/pop_1kG14.txt "$PWD"/data/Output/plink/2-POP/pop_EP.txt | gsed -e '1i\FID IID SUPERPOP' > "$PWD"/data/Output/plink/2-POP/popfile.txt
# remove temporary files from above
rm -f "$PWD"/data/Output/plink/2-POP/pop_*.txt
PCA
With the data merged we can perform PCA on plink.
plink --bfile "$PWD"/data/Output/plink/2-POP/EP_1KG_PCA --pca --out "$PWD"/data/Output/plink/2-POP/PCA_results
We will plot the results using a small Python script (all credit to Xavier Farré)
# engine.opts='-l' loads files that are normally loaded when you start the shell like for instance ~/.bash_profile. In this case, python 3 is aliased to python, instead of using python 2.7.
python --version
python "$PWD"/data/Input/1KG_PCA/plot_PCA.py
So, as we can see there is no clustering visible between any of the populations. That is probably because of the low number of SNPs available (n < 70) and little to no allele frequency between these superpopulations for these markers.
Thus, no further sample was removed from the EP dataset because even if it was not from a European individual it would not affect the outcome of the results.
Resulting cases and SNPs
To obtain the SNPs and cases that are kept after QC, first we can recode the binary fails obtained into map and ped files
mkdir -p "$PWD"/data/Output/plink/1-QC/Final
plink --bfile "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity --recode --out "$PWD"/data/Output/plink/1-QC/Final/after_QC
run_shell('mkdir %CD%/data/Output/plink/1-QC/Final')
run_shell('plink --bfile "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity --recode --out "$PWD"/data/Output/plink/1-QC/Final/after_QC')
Then we can import those files.
path <- file.path(output, "plink", "1-QC", "Final")
QC_map <- read.delim(file.path(path, "after_QC.map"), header = FALSE)
QC_ped <- read.table(file.path(path, "after_QC.ped"), quote = "\"", comment.char = "")
The second column of the map file contains the SNPs that remain after the QC and the second column of the ped file contains the IIDs kept. Thus, we can see the difference between those in the dataset without filtering and those kept after QC procedures.
snps_remain <- QC_map$V2
ids_remain <- QC_ped$V2
snps_removed <- setdiff(names(list_of_objects), snps_remain)
ids_removed <- setdiff(All$ID, ids_remain)
length(snps_removed)
## [1] 7
length(ids_removed)
## [1] 855
So 6 SNPs were removed as well as 874 cases.
First we can see to which genes those SNPs belonged.
genes <- sapply(snps_removed, function(SNP) {list_of_objects[[SNP]]$gene})
sort(table(genes), decreasing = T)
## genes
## MAPT APOE DBH NR1H2 RFC1 TCN2
## 2 1 1 1 1 1
And then we can obtain the center from which the cases removed were obtained and the diagnosis group.
centers <- sapply(ids_removed, function(id) {All$Centre[All$ID == id]}, USE.NAMES = F)
sort(table(centers), decreasing = T)
## centers
## ROTTERDAM NOTTINGHAM MADRID OVIEDO BRISTOL OPTIMA SANTANDER
## 421 127 115 61 57 47 27
diagnoses <- sapply(ids_removed, function(id) {All$Diag[All$ID == id]}, USE.NAMES = F)
sort(table(diagnoses), decreasing = T)
## diagnoses
## Control AD
## 516 339
Then, we can keep only the cases and SNPs that passed the QC procedure in the dataset…
All <- All[All$ID %in% ids_remain,]
variables <- c("ID", "Diag", "Sex", "E4status", "Age_to_use", "Age82", "Centre", "Region")
All <- All[,c(variables, snps_remain)]
And in the list and vectors of SNPs.
list_of_objects <- lapply(list_of_objects, function(SNP) {if (SNP$id %in% snps_remain) {SNP}})
list_of_objects[sapply(list_of_objects, is.null)] <- NULL
names(list_of_objects) <- sapply(list_of_objects, function(SNP) {SNP$id})
class(list_of_objects) <- "SNP_set"
list_of_objects
## This is just a convenience function, invocated when print() is used, to provide an overview of the genes and snps present in this list. To view its contents, use list_of_objects$your_snp_id
##
##
## $snps
## [1] "rs5749131" "rs5753231" "rs16988828" "rs9606756" "rs757874"
## [6] "rs1801198" "rs5749135" "rs9621049" "rs7289553" "rs5997711"
## [11] "rs1801394" "rs13181011" "rs326120" "rs1532268" "rs7703033"
## [16] "rs6555501" "rs162035" "rs3815743" "rs162040" "rs10475399"
## [21] "rs17585355" "rs2228604" "rs7536292" "rs17646665" "rs1149175"
## [26] "rs10745354" "rs6265" "rs11030102" "rs11030104" "rs11030119"
## [31] "rs12288512" "rs16961153" "rs1979277" "rs669340" "rs643333"
## [36] "rs9901160" "rs4975002" "rs4975009" "rs6851075" "rs7412"
## [41] "rs597668" "rs1800977" "rs2422493" "rs7157609" "rs4900442"
## [46] "rs1611115" "rs4945261" "rs2373115" "rs2236707" "rs4800488"
## [51] "rs2695121" "rs1187323" "rs1545285" "rs7946" "rs12325817"
## [56] "rs731236" "rs7975232" "rs744373" "rs3865444" "rs2069442"
## [61] "rs2071746" "rs3931914" "rs1800896" "rs1799986" "rs11754661"
## [66] "rs1799990" "rs1051266" "rs13022344"
##
## $genes
## [1] "TCN2" "MTRR" "SORT1" "BDNF" "SHMT1" "RFC1" "APOE"
## [8] "ABCA1" "CYP46A1" "DBH" "GAB2" "NPC1" "NR1H2" "NTRK2"
## [15] "PEMT" "VDR" "BIN1" "CD33" "CDK5" "HMDX1" "HMGCR"
## [22] "IL10" "LRP1" "MTHFD1L" "PRNP" "SLC19A1" "TRAK2"
##
##
## The list has a total of 68 snps and 27 genes
vector_of_snps <- snps_remain
Defining inheritance models
The next step is to obtaining all possible snp inheritance model combinations and adding the new columns to the dataset.
recessive <- paste0(vector_of_snps, "r")
additive <- paste0(vector_of_snps, "a")
dominant <- paste0(vector_of_snps, "d")
inheritance <- c(recessive, additive, dominant)
All[inheritance] <- NA
Then we define the variables (columns) order in the dataset.
col_order <- c("ID", "Diag", "Sex", "Age_to_use", "Age82", "Region", "E4status", "Centre", vector_of_snps, inheritance)
All <- All[,col_order]
Finally we code the inheritance with numbers depending on the genotype values for each snp. (0 reference, 1 risk allele effect, 2 two risk alleles effect {just for the additive model})
All <- generate_inheritances(snps = list_of_objects, data = All)
Creating data subsets
We create a data subset for each of the regions.
for (region in levels(All$Region)) {
subset <- subset(All, Region == region)
assign(region, subset)
}
And for each of the centers.
for (center in levels(All$Centre)) {
subset <- subset(All, Centre == center)
center <- str_to_title(center)
assign(center, subset)
}
Analysis
Main effects
First, we select the datasets into which we want to perform the analysis and then all covariates we want to control for.
DATASETS <- c("All", "N.Eur", "Spain")
variables <- c("Sex", "E4status", "Age82", "Centre")
Then we reorder the datasets alphabetically and set Santander as the reference level in the Spanish instead of Madrid as the former has a higher N and also making sure the analysis are performed with respect to the Control reference level not the Diagnosed ones.
for (i in seq_along(DATASETS)) {
assign(DATASETS[i], get(DATASETS[i])[,order(names(get(DATASETS[i])))])
}
Spain$Centre <- relevel(Spain$Centre, ref = 6)
Finally we perform the main effects analysis for the selected SNPs.
master_list <- perform_analysis(.mode = "main_effects", .data = DATASETS, snps = list_of_objects, covariates = variables)
This function outputs a list with the results of the glm contained in it (best model of inheritance and glm results). An example of the structure of the list with only the main effects performed can be seen here, consisting in the first snp of the All (N.Eur + Spain) dataset:
## $rs5749131
## $rs5749131$Gene
## [1] "TCN2"
##
## $rs5749131$Best_model
## rs5749131d
## "[1] AIC quasi= 7397.08954"
##
## $rs5749131$Main_effects
## $rs5749131$Main_effects$rs5749131a
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.627 0.161 1.873 1.365 2.568 1.004591e-04
## SexMale -0.559 0.061 0.572 0.507 0.645 9.122544e-20
## E4statusE4+ 1.086 0.060 2.963 2.632 3.336 1.228676e-70
## Age82>82 0.310 0.061 1.364 1.209 1.539 4.482749e-07
## CentreMADRID -0.843 0.182 0.431 0.301 0.615 3.665023e-06
## CentreNOTTINGHAM -0.509 0.196 0.601 0.410 0.882 9.323218e-03
## CentreOPTIMA -0.544 0.179 0.580 0.408 0.824 2.397197e-03
## CentreOVIEDO 0.333 0.204 1.395 0.936 2.079 1.016127e-01
## CentreSANTANDER -0.071 0.195 0.932 0.636 1.365 7.161204e-01
## CentreROTTERDAM -2.377 0.152 0.093 0.069 0.125 5.021860e-54
## rs5749131a1 -0.101 0.065 0.904 0.796 1.027 1.224832e-01
## rs5749131a2 -0.209 0.085 0.811 0.686 0.959 1.402907e-02
##
## $rs5749131$Main_effects$rs5749131d
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.624 0.161 1.867 1.361 2.560 1.076672e-04
## SexMale -0.559 0.061 0.572 0.507 0.645 8.643812e-20
## E4statusE4+ 1.086 0.060 2.964 2.633 3.337 9.484831e-71
## Age82>82 0.311 0.061 1.365 1.210 1.540 4.096063e-07
## CentreMADRID -0.835 0.182 0.434 0.304 0.619 4.342755e-06
## CentreNOTTINGHAM -0.508 0.195 0.601 0.410 0.882 9.291370e-03
## CentreOPTIMA -0.539 0.179 0.583 0.410 0.828 2.604900e-03
## CentreOVIEDO 0.337 0.203 1.400 0.940 2.086 9.805458e-02
## CentreSANTANDER -0.067 0.195 0.935 0.638 1.369 7.298276e-01
## CentreROTTERDAM -2.375 0.152 0.093 0.069 0.125 5.561420e-54
## rs5749131d1 -0.130 0.062 0.878 0.778 0.991 3.482203e-02
##
## $rs5749131$Main_effects$rs5749131r
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.572 0.157 1.772 1.302 2.411 2.741695e-04
## SexMale -0.560 0.061 0.571 0.507 0.644 8.536540e-20
## E4statusE4+ 1.085 0.060 2.961 2.630 3.333 1.514202e-70
## Age82>82 0.309 0.061 1.362 1.207 1.536 5.061035e-07
## CentreMADRID -0.850 0.182 0.427 0.299 0.610 2.970276e-06
## CentreNOTTINGHAM -0.512 0.196 0.599 0.409 0.879 8.853409e-03
## CentreOPTIMA -0.553 0.179 0.575 0.405 0.817 2.040860e-03
## CentreOVIEDO 0.329 0.204 1.390 0.933 2.072 1.054753e-01
## CentreSANTANDER -0.079 0.195 0.924 0.631 1.353 6.833261e-01
## CentreROTTERDAM -2.380 0.152 0.093 0.069 0.125 3.669116e-54
## rs5749131r1 -0.149 0.076 0.862 0.743 1.000 4.958275e-02
We can appreciate that the main effects analysis were also performed for the Spanish and Northern European subsets.
names(master_list)
## [1] "All" "N.Eur" "Spain"
Then, selecting a threshold of 0.05, before applying multiple testing correction, we can obtain the following results for the snps under the threshold and to which genes it belongs for the whole dataset.
print_ME(master_list)
##
## rs5749131d1 rs5997711r1 rs11030102d1 rs11030119d1 rs12288512d1 rs7412d1
## 0.035 0.021 0.000 0.004 0.002 0.000
## rs2422493r1 rs4945261d1 rs2373115d1 rs744373a2 rs13022344r1
## 0.045 0.022 0.022 0.000 0.043
##
##
## sig_genes
## BDNF GAB2 TCN2 ABCA1 APOE BIN1 TRAK2
## 3 2 2 1 1 1 1
print_ME(master_list, corrected = T)
##
## rs7412d1 rs744373a2
## 0.010 0.047
##
##
## sig_genes
## APOE BIN1
## 1 1
We really have 3 snp models which are found significantly associated with AD, even after multiple testing correction.
Those are the APOE gene, for which has the E4 variant was found to be the largest known genetic risk factor for late-onset sporadic AD. The rationale is that this isoform APOE ε4 is not as effective as the others at promoting these reactions, resulting in increased vulnerability to AD in individuals with that gene variation
The BDNF snp (rs11030102) has been linked to decreased BDNF serum levels. In the adult brain, BDNF maintains high expression levels and regulates both excitatory and inhibitory synaptic transmission and activity-dependent plasticity (Miranda et al., 2019)
The bridging integrator 1 (BIN1) gene is the second most important susceptibility gene for late-onset Alzheimer’s disease (LOAD) after apolipoprotein E (APOE) gene. (Hu et al., 2021). BIN1 encodes a Myc-interacting protein but its role in AD is still unclear.
To see a count of the number of genes studied, according to the number of snps we can use the following code:
genes_studied <- sapply(master_list$All, function(snp) {snp$Gene})
genes_studied <- unname(genes_studied)
genes_studied <- table(genes_studied)
sort(genes_studied, decreasing = T)
## genes_studied
## MTRR TCN2 SORT1 BDNF SHMT1 RFC1 ABCA1 APOE CYP46A1 GAB2
## 10 10 6 5 5 3 2 2 2 2
## NPC1 NTRK2 PEMT VDR BIN1 CD33 CDK5 DBH HMDX1 HMGCR
## 2 2 2 2 1 1 1 1 1 1
## IL10 LRP1 MTHFD1L NR1H2 PRNP SLC19A1 TRAK2
## 1 1 1 1 1 1 1
Here we can see that from the 75 SNPs studied, all 3 APOE snps were found significant as well as the only SNP studied for BIN1. For BDNF, though just one out of five SNPs was deemed significant according to main effects, snp rs11030102.
To measure the strength of the association found we can extract the OR from the main effects results.
significant_snps <- c("rs7412d1", "rs744373a2")
OR_results <- sapply(significant_snps, function(snp_model_lvl){
snp_model <- sub("[0-2]$", "", snp_model_lvl);
snp <- sub("[a-z]$", "", snp_model);
glm_results <- master_list$All[[snp]]$Main_effects[[snp_model]]
index <- grep(rownames(glm_results), pattern = snp_model_lvl)
glm_results[index,][3]
})
snps <- sub(names(OR_results), pattern = "[a-z][0-2].OR$", replacement = "")
genes <- sapply(snps, function(snp) {list_of_objects[[snp]]$gene})
OR_results2 <- OR_results
names(OR_results2) <- genes
sort(OR_results, decreasing = T)
## rs744373a2.OR rs7412d1.OR
## 1.466 0.693
sort(OR_results2, decreasing = T)
## BIN1 APOE
## 1.466 0.693
On the other hand, the others, hold more moderate effects, with the dominant model of snp rs7412d, which also corresponds to APOE, increases the odds of having the disease 0.682. Or what is the same, decreases it by 1/0.682 = 1.4662757
We can generate a dataframe to plot these results:
plot_df <- data.frame(matrix(NA, nrow = length(significant_snps), ncol = 6))
colnames(plot_df) <- c("snp", "gene", "OR", "lower", "upper", "pvalue")
for (i in seq_along(significant_snps)) {
snp_model_lvl <- significant_snps[i]
snp_model <- sub(snp_model_lvl, pattern = "[0-2]$", replacement = "")
snp <- sub(snp_model, pattern = "[a-z]$", replacement = "")
SNP <- master_list$All[[snp]]
gene <- SNP$Gene
main_effects <- SNP$Main_effects[[snp_model]]
snp_term <- grep(pattern = snp_model_lvl, x = rownames(main_effects))
OR <- main_effects[snp_term, 3]
lower <- main_effects[snp_term, 4]
upper <- main_effects[snp_term, 5]
pvalue <- main_effects[snp_term, 6]
plot_df[i,1] <- snp_model_lvl
plot_df[i,2] <- gene
plot_df[i,3] <- OR
plot_df[i,4] <- lower
plot_df[i,5] <- upper
plot_df[i,6] <- pvalue
}
And plot them:
## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.

Covariate interaction
The next step in the analysis would be to examine all possible interactions of the 75 studied SNPs with the covariates Age82, Sex & E4status.
In practice this means repeating the models from the main effects but by adding one interaction term at a time.
In order to reduce the multiple testing performed and due to the previous findings of what the best models were, only the best snp models chosen for each main effect were employed in this subsequent step.
master_list <- perform_analysis(.mode = "interaction", .data = DATASETS, snps = list_of_objects, covariates = variables)
And here we can see an example of the kind of results obtained by doing this analysis.
master_list$All[[1]]$Interactions$Other_covariates
## $Age82
## $Age82$Name
## [1] "rs5749131d1:Age82>82"
##
## $Age82$Summary
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.635 0.166 1.887 1.363 2.613 1.314493e-04
## SexMale -0.559 0.061 0.572 0.507 0.645 8.628824e-20
## E4statusE4+ 1.086 0.060 2.962 2.631 3.335 1.350355e-70
## CentreMADRID -0.834 0.182 0.434 0.304 0.620 4.534412e-06
## CentreNOTTINGHAM -0.507 0.195 0.602 0.411 0.883 9.515261e-03
## CentreOPTIMA -0.538 0.179 0.584 0.411 0.830 2.699116e-03
## CentreOVIEDO 0.338 0.204 1.403 0.941 2.090 9.642355e-02
## CentreSANTANDER -0.065 0.195 0.937 0.640 1.373 7.390780e-01
## CentreROTTERDAM -2.374 0.152 0.093 0.069 0.126 7.240989e-54
## rs5749131d1 -0.148 0.090 0.863 0.723 1.029 1.011110e-01
## Age82>82 0.289 0.103 1.335 1.091 1.633 5.032564e-03
## rs5749131d1:Age82>82 0.034 0.124 1.034 0.812 1.317 7.855417e-01
##
## $Age82$SF
## [1] 1.034
##
## $Age82$Significant
## [1] FALSE
##
##
## $Sex
## $Sex$Name
## [1] "rs5749131d1:SexMale"
##
## $Sex$Summary
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.631 0.165 1.880 1.361 2.597 1.290372e-04
## E4statusE4+ 1.087 0.060 2.964 2.633 3.337 9.654106e-71
## Age82>82 0.311 0.061 1.365 1.210 1.540 4.138155e-07
## CentreMADRID -0.836 0.182 0.433 0.303 0.619 4.274699e-06
## CentreNOTTINGHAM -0.509 0.195 0.601 0.410 0.882 9.251647e-03
## CentreOPTIMA -0.540 0.179 0.583 0.410 0.828 2.576107e-03
## CentreOVIEDO 0.336 0.203 1.399 0.939 2.085 9.848828e-02
## CentreSANTANDER -0.068 0.195 0.934 0.638 1.368 7.258487e-01
## CentreROTTERDAM -2.376 0.152 0.093 0.069 0.125 5.646864e-54
## rs5749131d1 -0.140 0.078 0.870 0.747 1.013 7.208241e-02
## SexMale -0.577 0.106 0.562 0.457 0.691 4.830146e-08
## rs5749131d1:SexMale 0.026 0.127 1.026 0.799 1.317 8.394594e-01
##
## $Sex$SF
## [1] 1.026
##
## $Sex$Significant
## [1] FALSE
##
##
## $E4status
## $E4status$Name
## [1] "rs5749131d1:E4statusE4+"
##
## $E4status$Summary
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.647 0.165 1.909 1.382 2.636 8.710879e-05
## SexMale -0.559 0.061 0.572 0.507 0.645 9.635296e-20
## Age82>82 0.310 0.061 1.363 1.208 1.538 4.797888e-07
## CentreMADRID -0.838 0.182 0.433 0.303 0.618 4.120560e-06
## CentreNOTTINGHAM -0.508 0.195 0.602 0.410 0.882 9.339027e-03
## CentreOPTIMA -0.542 0.179 0.582 0.410 0.827 2.515097e-03
## CentreOVIEDO 0.334 0.203 1.396 0.937 2.080 1.010743e-01
## CentreSANTANDER -0.067 0.195 0.935 0.638 1.370 7.304117e-01
## CentreROTTERDAM -2.376 0.152 0.093 0.069 0.125 5.205773e-54
## rs5749131d1 -0.161 0.078 0.852 0.731 0.991 3.820745e-02
## E4statusE4+ 1.030 0.105 2.802 2.280 3.444 1.636899e-22
## rs5749131d1:E4statusE4+ 0.083 0.127 1.087 0.846 1.395 5.149966e-01
##
## $E4status$SF
## [1] 1.087
##
## $E4status$Significant
## [1] FALSE
First, for the variable “Name” we have the interaction studied, then as “Summary” the full glm model results obtained, and the rest of variables, extract the most interesting results for the interaction, the “SF” and whether it is found “Significant” or not.
The SF is a statistic used to measure interactions in complex diseases. It is defined as the OR of the interaction divided by the product of the OR of each of the interaction on its own.
Thus, it makes a good and easy to understand metric for what we are measuring as if there was no interaction effect we would expect the SF to be close to 1, as the impact of the interaction should be about the same as the effect of each of the members on its on. However, if the interaction is found greater or lower than 1, that means we are having a synergistic effect in which the odds on having the disease is increased or reduced respectively.
Having said that, we can obtain the significant results found for this snp-covariate interaction measured:
cov <- setdiff(variables, c("Centre", "E4status"))
print_INT(master_list, covariates = cov)
##
## rs5997711r1:E4statusE4+ rs13181011r1:SexMale rs326120r1:Age82>82
## 0.038 0.042 0.019
## rs162040d1:E4statusE4+ rs11030102d1:SexMale rs11030119d1:SexMale
## 0.016 0.001 0.023
## rs12288512d1:SexMale rs16961153r1:SexMale rs16961153r1:E4statusE4+
## 0.001 0.025 0.021
## rs6851075r1:SexMale rs7412d1:Age82>82 rs2069442r1:E4statusE4+
## 0.050 0.012 0.005
## rs11754661d1:SexMale
## 0.042
##
##
## [1] "Sex"
## sig_genes
## BDNF MTHFD1L MTRR RFC1 SHMT1
## 3 1 1 1 1
##
##
## [1] "Age82"
## sig_genes
## APOE MTRR
## 1 1
However, after applying multiple testing correction no significant associations are found.
print_INT(master_list, covariates = cov, corrected = T)
## No significant interactions found
SNP-SNP interaction
Reported
Get the SNP-SNP interactions dataset.
snp_snp <- read.csv2(file = file.path(input, "interactions.csv"))
rownames(snp_snp) <- NULL
Separate the datasets into 2, one with the APOE interactions and the other with the rest.
snp_APOE <- snp_snp[snp_snp$gene2 == "APOE",]
snp_snp <- snp_snp[snp_snp$gene2 != "APOE",]
Recode the model into a format "rs[0-9]*[a,d,r]", and replace unknown with the best model found for main effects.
snp_APOE$model1 <- get_model(snp_APOE$snp1, snp_APOE$model1, master_list)
snp_APOE$model2 <- "E4status"
snp_snp$model1 <- get_model(snp_snp$snp1, snp_snp$model1, master_list)
snp_snp$model2 <- get_model(snp_snp$snp2, snp_snp$model2, master_list)
Get a list of the interactions found in it.
list_of_snp_pairs <- generate_list_of_snp_pairs(mode = "reported_model",
snp_df = snp_snp)
list_of_APOE <- generate_list_of_snp_pairs(mode = "reported_model",
snp_df = snp_APOE)
list_of_snp_pairs
## [[1]]
## [1] "rs13022344d" "rs597668d"
##
## [[2]]
## [1] "rs13022344d" "rs744373d"
##
## [[3]]
## [1] "rs2071746d" "rs2695121r"
##
## [[4]]
## [1] "rs1800977r" "rs3931914d"
##
## [[5]]
## [1] "rs2422493r" "rs3931914d"
##
## [[6]]
## [1] "rs2422493r" "rs4800488r"
##
## [[7]]
## [1] "rs2422493r" "rs2236707r"
##
## [[8]]
## [1] "rs7975232r" "rs1800896d"
##
## [[9]]
## [1] "rs731236d" "rs1800896d"
##
## [[10]]
## [1] "rs7157609r" "rs4900442r"
list_of_APOE
## [[1]]
## [1] "rs2069442r" "E4status"
##
## [[2]]
## [1] "rs1800977r" "E4status"
##
## [[3]]
## [1] "rs597668d" "E4status"
##
## [[4]]
## [1] "rs744373d" "E4status"
##
## [[5]]
## [1] "rs1799990r" "E4status"
##
## [[6]]
## [1] "rs2373115d" "E4status"
Creating the interactions containers in the master_list
master_list <- store_interactions(snp_dataset = snp_snp, master_list = master_list)
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list
Performing the analysis
master_list <- perform_analysis(.mode = "snp",
.submode = "reported_model",
.data = DATASETS,
snps = list_of_snp_pairs,
covariates = variables,
.verbose = T)
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
Printing the results
print_SNP_SNP(master_list, list_of_snp_pairs, list_of_APOE)
##
## rs7157609r1:rs4900442r1 rs2069442r1:E4statusE4+
## 0.000 0.005
##
##
## sig_genes
## CDK5-APOE CYP46A1-CYP46A1
## 1 1
print_SNP_SNP(master_list, list_of_snp_pairs, list_of_APOE, corrected = T)
##
## rs7157609r1:rs4900442r1
## 0.001
##
##
## CYP46A1-CYP46A1
## 1
We can plot this interaction but by center.
CYP46A1_interaction <- vector(mode = "list", length = length(levels(All$Centre)))
for (centre in str_to_title(levels(All$Centre))) {
snp1_model <- "rs7157609r"
snp2_model <- "rs4900442r"
covariates <- c("Sex", "Age82", "E4status")
interaction <- paste0(snp1_model, "*", snp2_model)
predictors <- c(covariates, interaction)
formula <- as.formula(paste("Diag", paste(predictors, collapse = " + "), sep = " ~ "))
args <- list(formula = formula, family = quasibinomial("logit"), data = as.name(centre))
linear_model <- do.call(glm, args)
OR_summary <- Coeff.OR2(linear_model)
cat("\n")
print(centre)
print(OR_summary)
i <- which(str_to_title(levels(All$Centre)) == centre)
CYP46A1_interaction[[i]] <- OR_summary
names(CYP46A1_interaction)[i] <- centre
}
##
## [1] "Bristol"
## Estimate Std. Error OR lower upper
## (Intercept) -0.03045 0.35927 0.97001 0.47969 1.96151
## SexMale -0.18704 0.34851 0.82941 0.41890 1.64221
## Age82>82 0.02347 0.35298 1.02375 0.51254 2.04484
## E4statusE4+ 2.64817 0.40538 14.12820 6.38298 31.27162
## rs7157609r1 -0.63006 0.78831 0.53256 0.11359 2.49686
## rs4900442r1 -0.00007 0.50460 0.99993 0.37192 2.68837
## rs7157609r1:rs4900442r1 0.80337 1.14392 2.23305 0.23723 21.01943
## Pr(>|t|)
## (Intercept) 9.325336e-01
## SexMale 5.920071e-01
## Age82>82 9.470403e-01
## E4statusE4+ 4.104252e-10
## rs7157609r1 4.249711e-01
## rs4900442r1 9.998895e-01
## rs7157609r1:rs4900442r1 4.832051e-01
##
## [1] "Madrid"
## Estimate Std. Error OR lower upper
## (Intercept) -0.34089 0.17543 0.71114 0.50423 1.00296
## SexMale -0.31897 0.23052 0.72690 0.46265 1.14208
## Age82>82 -0.30332 0.25176 0.73836 0.45079 1.20940
## E4statusE4+ 1.69970 0.25111 5.47233 3.34523 8.95198
## rs7157609r1 -0.33175 0.63237 0.71767 0.20780 2.47860
## rs4900442r1 -0.39936 0.36058 0.67075 0.33085 1.35985
## rs7157609r1:rs4900442r1 0.98453 0.90893 2.67654 0.45069 15.89529
## Pr(>|t|)
## (Intercept) 5.272160e-02
## SexMale 1.672540e-01
## Age82>82 2.290165e-01
## E4statusE4+ 4.841628e-11
## rs7157609r1 6.001507e-01
## rs4900442r1 2.687462e-01
## rs7157609r1:rs4900442r1 2.794075e-01
##
## [1] "Nottingham"
## Estimate Std. Error OR lower upper
## (Intercept) -0.41956 0.31418 0.65734 0.35510 1.21682
## SexMale -0.41392 0.28571 0.66105 0.37760 1.15728
## Age82>82 0.70019 0.28802 2.01414 1.14531 3.54204
## E4statusE4+ 1.68982 0.29524 5.41850 3.03786 9.66474
## rs7157609r1 -0.91923 0.66836 0.39883 0.10761 1.47811
## rs4900442r1 -0.66756 0.43288 0.51296 0.21959 1.19828
## rs7157609r1:rs4900442r1 2.54708 0.96496 12.76970 1.92660 84.63869
## Pr(>|t|)
## (Intercept) 1.828825e-01
## SexMale 1.485830e-01
## Age82>82 1.571232e-02
## E4statusE4+ 2.797494e-08
## rs7157609r1 1.701788e-01
## rs4900442r1 1.242252e-01
## rs7157609r1:rs4900442r1 8.789293e-03
##
## [1] "Optima"
## Estimate Std. Error OR lower upper
## (Intercept) -0.62762 0.23284 0.53386 0.33824 0.84262
## SexMale -0.14355 0.22565 0.86628 0.55665 1.34814
## Age82>82 0.30978 0.23067 1.36313 0.86734 2.14231
## E4statusE4+ 2.06097 0.23308 7.85357 4.97351 12.40140
## rs7157609r1 -1.37584 0.60253 0.25263 0.07755 0.82292
## rs4900442r1 -0.31941 0.29480 0.72658 0.40770 1.29486
## rs7157609r1:rs4900442r1 2.74865 0.77739 15.62147 3.40406 71.68798
## Pr(>|t|)
## (Intercept) 7.311692e-03
## SexMale 5.250180e-01
## Age82>82 1.800036e-01
## E4statusE4+ 2.613102e-17
## rs7157609r1 2.290212e-02
## rs4900442r1 2.792225e-01
## rs7157609r1:rs4900442r1 4.518687e-04
##
## [1] "Oviedo"
## Estimate Std. Error OR lower upper
## (Intercept) 0.52828 0.22416 1.69601 1.09299 2.63172
## SexMale -0.12607 0.29733 0.88156 0.49222 1.57885
## Age82>82 3.11393 1.03902 22.50927 2.93719 172.50096
## E4statusE4+ 0.74632 0.32886 2.10922 1.10711 4.01840
## rs7157609r1 -0.94419 0.79781 0.38900 0.08144 1.85804
## rs4900442r1 0.28415 0.45425 1.32863 0.54543 3.23642
## rs7157609r1:rs4900442r1 1.51544 1.20898 4.55142 0.42564 48.66856
## Pr(>|t|)
## (Intercept) 0.019152615
## SexMale 0.671902012
## Age82>82 0.002979355
## E4statusE4+ 0.024029937
## rs7157609r1 0.237658874
## rs4900442r1 0.532144484
## rs7157609r1:rs4900442r1 0.211108446
##
## [1] "Santander"
## Estimate Std. Error OR lower upper
## (Intercept) 0.68813 0.23384 1.98999 1.25836 3.14702
## SexMale -0.25456 0.28765 0.77526 0.44116 1.36239
## Age82>82 -0.95175 0.28291 0.38606 0.22174 0.67217
## E4statusE4+ 1.77570 0.32654 5.90440 3.11329 11.19779
## rs7157609r1 0.28540 0.88403 1.33029 0.23520 7.52399
## rs4900442r1 -0.38858 0.44284 0.67802 0.28464 1.61509
## rs7157609r1:rs4900442r1 -1.88128 1.41850 0.15239 0.00945 2.45708
## Pr(>|t|)
## (Intercept) 3.511210e-03
## SexMale 3.769086e-01
## Age82>82 8.693889e-04
## E4statusE4+ 1.132204e-07
## rs7157609r1 7.470494e-01
## rs4900442r1 3.809449e-01
## rs7157609r1:rs4900442r1 1.857830e-01
##
## [1] "Rotterdam"
## Estimate Std. Error OR lower upper
## (Intercept) -1.72576 0.07470 0.17804 0.15379 0.20611
## SexMale -0.73033 0.07818 0.48175 0.41331 0.56153
## Age82>82 0.42131 0.07486 1.52395 1.31598 1.76480
## E4statusE4+ 0.77592 0.07450 2.17259 1.87742 2.51416
## rs7157609r1 -0.24360 0.19420 0.78380 0.53568 1.14686
## rs4900442r1 -0.12362 0.10507 0.88371 0.71923 1.08581
## rs7157609r1:rs4900442r1 0.50423 0.25059 1.65571 1.01316 2.70578
## Pr(>|t|)
## (Intercept) 7.898817e-113
## SexMale 1.343549e-20
## Age82>82 1.914210e-08
## E4statusE4+ 3.604039e-25
## rs7157609r1 2.097575e-01
## rs4900442r1 2.394325e-01
## rs7157609r1:rs4900442r1 4.425165e-02
#PLOT
forest_plot <- as.data.frame(matrix(data = NA, nrow = length(CYP46A1_interaction), ncol = 6))
colnames(forest_plot) <- c("SF", "lower", "upper", "p_value", "Centre", "gene")
for (i in seq_along(CYP46A1_interaction)) {
results <- CYP46A1_interaction[[i]]
results <- tail(results, n = 1)
forest_plot[i, 1] <- results[3]
forest_plot[i, 2] <- results[4]
forest_plot[i, 3] <- results[5]
forest_plot[i, 4] <- as.character(ifelse(results[6] >= 0.005, round(results[6], digits = 2),"< 0.005"))
forest_plot[i, 5] <- names(CYP46A1_interaction)[i]
forest_plot[i, 6] <- "CYP46A1*CYP46A1"
}
p <- ggplot(data = forest_plot, aes(x = SF, y = Centre, xmin = lower, xmax = upper))
p <- p + geom_pointrange(aes(col = Centre))
p <- p + geom_vline(aes(fill = Centre), xintercept = 1, linetype = "dotted")
## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.
p <- p + geom_errorbar(aes(xmin = lower, xmax = upper, col = Centre),width = 0.5, cex = 1)
p <- p + facet_wrap(~gene, strip.position = "left", nrow = 1, scales = "free_y")
p <- p + geom_text(aes(label = p_value), hjust = -0.5, vjust = -1)
p <- p+theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
p

Doing the same but without Optima to ascertain that the signal doesn’t come only from there.
no_optima <- All[All$Centre != "NOTTINGHAM",]
formula <- as.formula(Diag ~ Sex + Centre + Age82 + E4status + rs7157609r * rs4900442r)
linear_model <- glm(formula = formula, family = quasibinomial("logit"), data = no_optima)
OR_summary <- Coeff.OR2(linear_model)
OR_summary
## Estimate Std. Error OR lower upper
## (Intercept) 0.55383 0.15831 1.73990 1.27574 2.37292
## SexMale -0.55980 0.06298 0.57133 0.50498 0.64639
## CentreMADRID -0.80888 0.18191 0.44536 0.31179 0.63614
## CentreOPTIMA -0.52162 0.17957 0.59356 0.41746 0.84394
## CentreOVIEDO 0.34359 0.20456 1.41000 0.94427 2.10542
## CentreSANTANDER -0.05130 0.19550 0.95000 0.64760 1.39360
## CentreROTTERDAM -2.35592 0.15292 0.09481 0.07025 0.12794
## Age82>82 0.29592 0.06307 1.34436 1.18805 1.52124
## E4statusE4+ 1.06714 0.06211 2.90704 2.57384 3.28337
## rs7157609r1 -0.38552 0.16625 0.68010 0.49097 0.94206
## rs4900442r1 -0.14805 0.08843 0.86239 0.72516 1.02559
## rs7157609r1:rs4900442r1 0.74107 0.21558 2.09817 1.37510 3.20146
## Pr(>|t|)
## (Intercept) 4.711384e-04
## SexMale 7.721177e-19
## CentreMADRID 8.857666e-06
## CentreOPTIMA 3.685038e-03
## CentreOVIEDO 9.306382e-02
## CentreSANTANDER 7.930341e-01
## CentreROTTERDAM 1.018479e-52
## Age82>82 2.751747e-06
## E4statusE4+ 7.107484e-65
## rs7157609r1 2.042292e-02
## rs4900442r1 9.412707e-02
## rs7157609r1:rs4900442r1 5.904397e-04
APOE4_CDK5_interaction <- vector(mode = "list", length = length(levels(All$Centre))-1)
for (centre in str_to_title(levels(All$Centre))) {
if (centre == "Bristol"){
next
}
snp1_model <- "rs2069442r"
snp2_model <- "E4status"
covariates <- c("Sex", "Age82")
interaction <- paste0(snp1_model, "*", snp2_model)
predictors <- c(covariates, interaction)
formula <- as.formula(paste("Diag", paste(predictors, collapse = " + "), sep = " ~ "))
args <- list(formula = formula, family = quasibinomial("logit"), data = as.name(centre))
linear_model <- do.call(glm, args)
OR_summary <- Coeff.OR2(linear_model)
cat("\n")
print(centre)
print(OR_summary)
i <- which(str_to_title(levels(All$Centre)) == centre)
APOE4_CDK5_interaction[[i]] <- OR_summary
names(APOE4_CDK5_interaction)[i] <- centre
}
##
## [1] "Madrid"
## Estimate Std. Error OR lower upper
## (Intercept) -0.42209 0.17044 0.65567 0.46947 0.91574
## SexMale -0.29434 0.22879 0.74502 0.47580 1.16658
## Age82>82 -0.26994 0.25011 0.76342 0.46760 1.24641
## rs2069442r1 0.56138 0.65398 1.75309 0.48655 6.31656
## E4statusE4+ 1.73354 0.25833 5.66067 3.41171 9.39212
## rs2069442r1:E4statusE4+ -1.01309 0.99437 0.36309 0.05171 2.54944
## Pr(>|t|)
## (Intercept) 1.369528e-02
## SexMale 1.990187e-01
## Age82>82 2.811197e-01
## rs2069442r1 3.911940e-01
## E4statusE4+ 6.890021e-11
## rs2069442r1:E4statusE4+ 3.089200e-01
##
## [1] "Nottingham"
## Estimate Std. Error OR lower upper
## (Intercept) -0.46658 0.30258 0.62714 0.34658 1.13481
## SexMale -0.51693 0.28392 0.59635 0.34184 1.04034
## Age82>82 0.70145 0.28993 2.01668 1.14245 3.55987
## rs2069442r1 0.94480 0.61158 2.57229 0.77577 8.52918
## E4statusE4+ 1.74744 0.30447 5.73990 3.16035 10.42495
## rs2069442r1:E4statusE4+ -1.62042 1.15742 0.19782 0.02047 1.91194
## Pr(>|t|)
## (Intercept) 1.242538e-01
## SexMale 6.977532e-02
## Age82>82 1.621992e-02
## rs2069442r1 1.235732e-01
## E4statusE4+ 2.584897e-08
## rs2069442r1:E4statusE4+ 1.626691e-01
##
## [1] "Optima"
## Estimate Std. Error OR lower upper
## (Intercept) -0.61016 0.21925 0.54326 0.35349 0.83491
## SexMale -0.14485 0.22538 0.86515 0.55621 1.34569
## Age82>82 0.25175 0.22990 1.28627 0.81967 2.01849
## rs2069442r1 -0.38795 0.69919 0.67845 0.17233 2.67103
## E4statusE4+ 2.04259 0.24127 7.71057 4.80522 12.37255
## rs2069442r1:E4statusE4+ 0.27336 0.92036 1.31437 0.21642 7.98259
## Pr(>|t|)
## (Intercept) 5.630354e-03
## SexMale 5.207830e-01
## Age82>82 2.741204e-01
## rs2069442r1 5.792908e-01
## E4statusE4+ 4.318149e-16
## rs2069442r1:E4statusE4+ 7.666016e-01
##
## [1] "Oviedo"
## Estimate Std. Error OR lower upper
## (Intercept) 0.52487 0.20923 1.69024 1.12162 2.54712
## SexMale -0.13342 0.28687 0.87510 0.49873 1.53548
## Age82>82 3.11450 1.01710 22.52221 3.06788 165.34228
## rs2069442r1 1.03470 0.79510 2.81427 0.59233 13.37116
## E4statusE4+ 0.79968 0.33571 2.22483 1.15222 4.29593
## rs2069442r1:E4statusE4+ -1.20038 1.17202 0.30108 0.03027 2.99449
## Pr(>|t|)
## (Intercept) 0.012698852
## SexMale 0.642233903
## Age82>82 0.002415291
## rs2069442r1 0.194227999
## E4statusE4+ 0.017895591
## rs2069442r1:E4statusE4+ 0.306641092
##
## [1] "Santander"
## Estimate Std. Error OR lower upper
## (Intercept) 0.61744 0.22975 1.85418 1.18192 2.90881
## SexMale -0.26664 0.28592 0.76595 0.43734 1.34146
## Age82>82 -0.91936 0.28094 0.39877 0.22993 0.69162
## rs2069442r1 -0.12380 0.74260 0.88356 0.20612 3.78747
## E4statusE4+ 1.76129 0.32953 5.81996 3.05082 11.10255
## rs2069442r1:E4statusE4+ -0.17215 1.36944 0.84185 0.05748 12.32899
## Pr(>|t|)
## (Intercept) 7.602214e-03
## SexMale 3.517886e-01
## Age82>82 1.191066e-03
## rs2069442r1 8.677114e-01
## E4statusE4+ 1.798408e-07
## rs2069442r1:E4statusE4+ 9.000454e-01
##
## [1] "Rotterdam"
## Estimate Std. Error OR lower upper
## (Intercept) -1.76515 0.07368 0.17116 0.14814 0.19775
## SexMale -0.72833 0.07818 0.48271 0.41414 0.56265
## Age82>82 0.41937 0.07483 1.52101 1.31351 1.76128
## rs2069442r1 0.33548 0.15737 1.39861 1.02740 1.90394
## E4statusE4+ 0.82213 0.07683 2.27534 1.95725 2.64512
## rs2069442r1:E4statusE4+ -0.77878 0.32055 0.45897 0.24486 0.86028
## Pr(>|t|)
## (Intercept) 8.483853e-121
## SexMale 1.693239e-20
## Age82>82 2.191982e-08
## rs2069442r1 3.306694e-02
## E4statusE4+ 1.826176e-26
## rs2069442r1:E4statusE4+ 1.515102e-02
APOE4_CDK5_interaction <- rlist::list.clean(APOE4_CDK5_interaction, recursive = T)
#plot
forest_plot <- as.data.frame(matrix(data = NA, nrow = length(APOE4_CDK5_interaction), ncol = 6))
colnames(forest_plot) <- c("SF", "lower", "upper", "p_value", "Centre", "gene")
for (i in seq_along(APOE4_CDK5_interaction)) {
results <- APOE4_CDK5_interaction[[i]]
results <- tail(results, n=1)
forest_plot[i, 1] <- results[3]
forest_plot[i, 2] <- results[4]
forest_plot[i, 3] <- results[5]
forest_plot[i, 4] <- round(results[6], digits = 2)
forest_plot[i, 5] <- names(APOE4_CDK5_interaction)[i]
forest_plot[i, 6] <- "APOE4*CDK5"
}
p <- ggplot(data = forest_plot, aes(x = SF, y = Centre, xmin = lower, xmax = upper))
p <- p + geom_pointrange(aes(col = Centre))
p <- p + geom_vline(aes(fill = Centre), xintercept = 1, linetype = "dotted")
## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.
p <- p + geom_errorbar(aes(xmin = lower, xmax = upper, col = Centre),width = 0.5, cex = 1)
p <- p + facet_wrap(~gene, strip.position = "left", nrow = 1, scales = "free_y")
p <- p + geom_text(aes(label = round(p_value, 3)),hjust = -0.5, vjust = -1)
p <- p+theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
p
